Abstract
GPCs were derived from hESCs, and FACs-sorted for CD140. One day after sort, GPCs were tranduced with viral particles carrying exogenous ZNF441 or EGFP as control (MOI=5). Cells were collected after 12 days and processed through 10x capture and library preparation. This vignette describes the analysis carried out after a count matrix had been obtained either by cellRanger or STARsolo.suppressPackageStartupMessages(library(Seurat))
suppressPackageStartupMessages(library(sctransform))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(DT))
suppressPackageStartupMessages(library(knitr))
suppressPackageStartupMessages(library(dplyr))
options(future.globals.maxSize = 3000 * 1024^2)
set.seed(123)# Color for cell type assignment
main_color <- c(RColorBrewer::brewer.pal(8, "Set3")[c(1, 8)], RColorBrewer::brewer.pal(6,
"Set1"), "magenta3")Read in data
# Matrices can be downloaded from GSE184517. After decompression, data directory
# should contain three matrices per sample
list.files("./data/scRNAseq/Sample_1_G19_ZNF441_GFP_matrices", recursive = TRUE)## [1] "barcodes.tsv.gz" "features.tsv.gz" "matrix.mtx.gz"
## [1] "barcodes.tsv.gz" "features.tsv.gz" "matrix.mtx.gz"
Data directory: Sample_1_G19_ZNF441_GFP_matrices
## [1] "OVX"
## [1] "Sample_1_G19_ZNF441_GFP_matrices"
# Read in 10X
sample.counts <- Read10X(paste0("./data/scRNAseq/", main_dir))
# Create Seurat object
object <- CreateSeuratObject(counts = sample.counts, min.cells = 3, min.features = 500,
project = sample_name)# Integration of transduction data. It will be the EGFP and exZNF441 genes. We
# subset this out into a different assay, so that it will not interfere with our
# downstream analysis of cell cluster
subset <- subset(object, features = c("EGFP", "exZNF441"))[["RNA"]]@counts
# Remove EGFP and exZNF441 from object
DefaultAssay(object) <- "RNA"
object <- subset(object, features = row.names(object[["RNA"]]@counts)[!row.names(object[["RNA"]]@counts) %in%
c("EGFP", "exZNF441")])
# Add back the EGFP and exZNF441 into a different assay
object[["LV"]] <- CreateAssayObject(counts = subset)Filter to exclude low-quality cells and multiplets
object[["percent.mt"]] <- PercentageFeatureSet(object, pattern = "^MT-")
# Data at a glance, before filtering
p1 <- VlnPlot(object, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3,
pt.size = 0.01)For sample OVX, we exclude cells with high mitochondrial content (20%). High gene counts and transcript counts could be indicative of doublets. Therefore, these cells were excluded as well: > 8500 gene counts and 4.510^{4} transcript counts.
# Exclude cells
object <- subset(object, subset = nFeature_RNA > 500 & nFeature_RNA < highgene.cutoff &
percent.mt < mito.cutoff & nCount_RNA < highcount.cutoff)
# Data at a glance, after filtering
p2 <- VlnPlot(object, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3,
pt.size = 0.01)
cowplot::plot_grid(NULL, p1, NULL, p2, labels = c("Before Filtering", "", "After Filtering",
""), rel_heights = c(0.1, 1, 0.1, 1), ncol = 1)# Cell cycle markers are loaded with library Seurat
s.genes = cc.genes$s.genes
g2m.genes = cc.genes$g2m.genes
# Rename some of the genes to current convention
s.genes[s.genes == "MLF1IP"] <- "CENPU"
g2m.genes[g2m.genes == "FAM64A"] <- "PIMREG"
g2m.genes[g2m.genes == "HN1"] <- "JPT1"
object <- CellCycleScoring(object, s.features = s.genes, g2m.features = g2m.genes,
set.ident = FALSE)## [1] "RNA"
Number of PCs used for downstream analysis is 14
## [1] "SCT"
## [1] 14
set.seed(123)
object <- RunUMAP(object, dims = 1:nPCs, verbose = FALSE, umap.method = "umap-learn")
DimPlot(object, group.by = "Phase")# Only run this for EGFP control sample
DefaultAssay(object) <- "SCT"
#
object <- FindNeighbors(object, dims = 1:nPCs, verbose = FALSE)
# Resolution used for cluster identification is 0.5 for CTR and 1 for OVX
RES## [1] 1
object <- FindClusters(object, resolution = RES, verbose = FALSE)
DimPlot(object, label = TRUE) + NoLegend() + ggtitle(paste0("Clustered with resolution ",
RES))viral_counts <- object[["LV"]]@counts
# A cell is counted as a transduced cells if it has non-zero expression of EGFP
# or exZNF441
x1 <- colnames(viral_counts)[which(viral_counts[1, ] != 0)]
if (sample_name == "OVX") {
x2 <- colnames(viral_counts)[which(viral_counts[2, ] != 0)]
}
# Nr of cells with EGFP expression
length(x1)## [1] 2134
if (sample_name == "OVX") {
# Nr of cells with exZNF441 expression
print(length(x2))
# Nr of cells with BOTH expression
print(length(intersect(x1, x2)))
# Nr of cells with EITHER expression
print(length(union(x1, x2)))
} else {
x2 <- NULL
}## [1] 1667
## [1] 1517
## [1] 2284
# Identity Assign as 'non-transduced' if no LV was detected
object$transduced <- ifelse(row.names(object@meta.data) %in% union(x1, x2), "transduced",
"non-transduced")Save object for downstream integrated analysis
Data directory: Sample_2_G19_GFPonly_matrices
## [1] "CTR"
## [1] "Sample_2_G19_GFPonly_matrices"
# Read in 10X
sample.counts <- Read10X(paste0("./data/scRNAseq/", main_dir))
# Create Seurat object
object <- CreateSeuratObject(counts = sample.counts, min.cells = 3, min.features = 500,
project = sample_name)# Integration of transduction data. It will be the EGFP and exZNF441 genes. We
# subset this out into a different assay, so that it will not interfere with our
# downstream analysis of cell cluster
subset <- subset(object, features = c("EGFP", "exZNF441"))[["RNA"]]@counts
# Remove EGFP and exZNF441 from object
DefaultAssay(object) <- "RNA"
object <- subset(object, features = row.names(object[["RNA"]]@counts)[!row.names(object[["RNA"]]@counts) %in%
c("EGFP", "exZNF441")])
# Add back the EGFP and exZNF441 into a different assay
object[["LV"]] <- CreateAssayObject(counts = subset)Filter to exclude low-quality cells and multiplets
object[["percent.mt"]] <- PercentageFeatureSet(object, pattern = "^MT-")
# Data at a glance, before filtering
p1 <- VlnPlot(object, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3,
pt.size = 0.01)For sample CTR, we exclude cells with high mitochondrial content (20%). High gene counts and transcript counts could be indicative of doublets. Therefore, these cells were excluded as well: > 9000 gene counts and 4.510^{4} transcript counts.
# Exclude cells
object <- subset(object, subset = nFeature_RNA > 500 & nFeature_RNA < highgene.cutoff &
percent.mt < mito.cutoff & nCount_RNA < highcount.cutoff)
# Data at a glance, after filtering
p2 <- VlnPlot(object, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3,
pt.size = 0.01)
cowplot::plot_grid(NULL, p1, NULL, p2, labels = c("Before Filtering", "", "After Filtering",
""), rel_heights = c(0.1, 1, 0.1, 1), ncol = 1)# Cell cycle markers are loaded with library Seurat
s.genes = cc.genes$s.genes
g2m.genes = cc.genes$g2m.genes
# Rename some of the genes to current convention
s.genes[s.genes == "MLF1IP"] <- "CENPU"
g2m.genes[g2m.genes == "FAM64A"] <- "PIMREG"
g2m.genes[g2m.genes == "HN1"] <- "JPT1"
object <- CellCycleScoring(object, s.features = s.genes, g2m.features = g2m.genes,
set.ident = FALSE)## [1] "RNA"
Number of PCs used for downstream analysis is 12
## [1] "SCT"
## [1] 12
set.seed(123)
object <- RunUMAP(object, dims = 1:nPCs, verbose = FALSE, umap.method = "umap-learn")
DimPlot(object, group.by = "Phase")# Only run this for EGFP control sample
DefaultAssay(object) <- "SCT"
#
object <- FindNeighbors(object, dims = 1:nPCs, verbose = FALSE)
# Resolution used for cluster identification is 0.5 for CTR and 1 for OVX
RES## [1] 0.5
object <- FindClusters(object, resolution = RES, verbose = FALSE)
DimPlot(object, label = TRUE) + NoLegend() + ggtitle(paste0("Clustered with resolution ",
RES))viral_counts <- object[["LV"]]@counts
# A cell is counted as a transduced cells if it has non-zero expression of EGFP
# or exZNF441
x1 <- colnames(viral_counts)[which(viral_counts[1, ] != 0)]
if (sample_name == "OVX") {
x2 <- colnames(viral_counts)[which(viral_counts[2, ] != 0)]
}
# Nr of cells with EGFP expression
length(x1)## [1] 3191
if (sample_name == "OVX") {
# Nr of cells with exZNF441 expression
print(length(x2))
# Nr of cells with BOTH expression
print(length(intersect(x1, x2)))
# Nr of cells with EITHER expression
print(length(union(x1, x2)))
} else {
x2 <- NULL
}
# Identity Assign as 'non-transduced' if no LV was detected
object$transduced <- ifelse(row.names(object@meta.data) %in% union(x1, x2), "transduced",
"non-transduced")Save object for downstream integrated analysis
Load pre-computed objects from PART1
Manually assign cell type identity for sample CTR. These are based on the expression of canonical cell type markers.
ctr$CellType <- plyr::mapvalues(ctr$SCT_snn_res.0.5, c(0:10), c("NSC", "Pre_GPC_1",
"NSC", "Pre_GPC_1", "Astrocyte_2", "Astrocyte_1", "Pre_GPC_2", "GPC", "NSC",
"Reactive_Astrocyte", "Astrocyte_2"))
ctr$CellType <- as.character(ctr$CellType)
ctr$CellType <- factor(ctr$CellType, levels = c("NSC", "Pre_GPC_1", "Pre_GPC_2",
"GPC", "Astrocyte_1", "Astrocyte_2", "Reactive_Astrocyte"))Expression of cell type markers in CTR
main_colorx <- main_color[c(1, 2, 5:9)]
ctr <- SetIdent(ctr, value = "CellType")
DefaultAssay(ctr) <- "RNA"
# Canonical markers
markers.list <- list(Astrocyte = c(CD44 = 3, GFAP = 3, AQP4 = 3, BMPR1B = 3), GPC = c(OLIG1 = 4,
OLIG2 = 2, CNP = 2, PDGFRA = 4), NSC = c(DCX = 5, DLX5 = 4.5, DLX1 = 4, DLX2 = 4),
Pre_GPC = c(PAX6 = 4, CAMK2N1 = 4, SOX9 = 4), Reactive = c(FAM89A = 2, CCDC69 = 2,
lv_EGFP = 2000))
p <- list()
for (m in names(markers.list)) {
markers_of_interest <- markers.list[[m]]
p[[m]] <- list()
for (i in 1:length(markers_of_interest)) {
p[[m]][[i]] <- VlnPlot(ctr, names(markers_of_interest)[i], cols = main_colorx,
pt.size = 0) + NoLegend() + ylim(0, markers_of_interest[i]) + labs(x = "")
}
}
p1 <- cowplot::plot_grid(plotlist = p[["NSC"]], ncol = 4)
p2 <- cowplot::plot_grid(plotlist = p[["Pre_GPC"]], ncol = 4)
p3 <- cowplot::plot_grid(plotlist = p[["GPC"]], ncol = 4)
p4 <- cowplot::plot_grid(plotlist = p[["Astrocyte"]], ncol = 4)
p5 <- cowplot::plot_grid(plotlist = p[["Reactive"]], ncol = 4)
cowplot::plot_grid(p1, p2, p3, p4, p5, ncol = 1)Use of Seurat label transfer and information gained from SCT cluster to assign OVX cell types
set.seed(2021)
DefaultAssay(ctr) <- "RNA"
DefaultAssay(ovx) <- "RNA"
ctr <- FindVariableFeatures(ctr)
anchors <- FindTransferAnchors(reference = ctr, query = ovx)
predictions <- TransferData(anchorset = anchors, refdata = ctr$CellType)pred <- predictions$predicted.id
# Change a subset of Pre_GPC into Dividing Pre_GPC based on their cell cycle
# scoring
cowplot::plot_grid(DimPlot(ovx, group.by = "Phase"), DimPlot(ovx, group.by = "SCT_snn_res.1",
label = TRUE) + NoLegend(), ncol = 2, rel_widths = c(1, 0.9))for (i in 1:length(pred)) {
if (ovx$SCT_snn_res.1[i] == 13 & pred[i] == "Pre_GPC_1") {
pred[i] <- "Pre_GPC_1_G2M"
} else if (ovx$SCT_snn_res.1[i] == 15 & pred[i] == "Pre_GPC_1") {
pred[i] <- "Pre_GPC_1_S"
}
}
ovx$CellType <- factor(pred, levels = c("NSC", "Pre_GPC_1", "Pre_GPC_1_G2M", "Pre_GPC_1_S",
"Pre_GPC_2", "GPC", "Astrocyte_1", "Astrocyte_2", "Reactive_Astrocyte"))Cell type marker expression in OVX
ovx <- SetIdent(ovx, value = "CellType")
# Canonical markers
markers.list <- list(Astrocyte = c("CD44", "GFAP", "AQP4", "BMPR1B"), GPC = c("OLIG1",
"OLIG2", "CNP", "PDGFRA"), NSC = c("DCX", "DLX5", "DLX1", "DLX2"), Pre_GPC = c("PAX6",
"CAMK2N1", "SOX9"), Reactive = c("FAM89A", "CCDC69", "lv_EGFP"))
p <- list()
for (m in names(markers.list)) {
markers_of_interest <- markers.list[[m]]
p[[m]] <- list()
for (i in 1:length(markers_of_interest)) {
# Note how we DID NOT have ylim for ovx samples
p[[m]][[i]] <- VlnPlot(ovx, markers_of_interest[i], cols = main_color, pt.size = 0) +
NoLegend() + labs(x = "")
}
}
p1 <- cowplot::plot_grid(plotlist = p[["NSC"]], ncol = 4)
p2 <- cowplot::plot_grid(plotlist = p[["Pre_GPC"]], ncol = 4)
p3 <- cowplot::plot_grid(plotlist = p[["GPC"]], ncol = 4)
p4 <- cowplot::plot_grid(plotlist = p[["Astrocyte"]], ncol = 4)
p5 <- cowplot::plot_grid(plotlist = p[["Reactive"]], ncol = 4)
cowplot::plot_grid(p1, p2, p3, p4, p5, ncol = 1)object.list <- list(OVX = ovx, CTR = ctr)
object.list <- lapply(object.list, function(x) {
DefaultAssay(x) <- "SCT"
return(x)
})
features <- SelectIntegrationFeatures(object.list, nfeatures = 3000)
object.list <- PrepSCTIntegration(object.list, anchor.features = features)
anchors <- FindIntegrationAnchors(object.list, normalization.method = "SCT", anchor.features = features,
verbose = FALSE)
integrated_object <- IntegrateData(anchorset = anchors, normalization.method = "SCT",
)# Read in cc markers
s.genes = cc.genes$s.genes
s.genes[s.genes == "MLF1IP"] <- "CENPU"
g2m.genes = cc.genes$g2m.genes
g2m.genes[g2m.genes == "FAM64A"] <- "PIMREG"
g2m.genes[g2m.genes == "HN1"] <- "JPT1"
DefaultAssay(integrated_object) <- "RNA"
integrated_object <- CellCycleScoring(integrated_object, s.features = s.genes, g2m.features = g2m.genes,
set.ident = FALSE)Number of PCs used for downstream analysis is 15
integrated_object$CellType <- as.character(integrated_object$CellType)
integrated_object$CellType <- factor(integrated_object$CellType, levels = c("NSC",
"Pre_GPC_1", "Pre_GPC_1_G2M", "Pre_GPC_1_S", "Pre_GPC_2", "GPC", "Astrocyte_1",
"Astrocyte_2", "Reactive_Astrocyte"))
integrated_object <- SetIdent(integrated_object, value = "CellType")
DimPlot(integrated_object, split.by = "orig.ident", cols = main_color)For the CTR, all transduced cells should be considered as control. For the OVX, transduced cells are overexpressed with ZNF441, and non-transduced cells should be considered as control
DefaultAssay(integrated_object) <- "RNA"
integrated_object$LV_exZNF441 <- ifelse(integrated_object$orig.ident == "CTR", "non-transduced",
integrated_object$transduced)
table(integrated_object$LV_exZNF441)##
## non-transduced transduced
## 3450 2284
Within each cell type, what are the differentially expressed genes between transduced (with ZNF441) vs non-transduced?
integrated_object <- SetIdent(integrated_object, value = "CellType")
deg <- list()
for (ct in levels(integrated_object$CellType)[c(1, 2, 5:8)]) {
# Excluding the reactive astrocytes and dividing GPCs since there were no
# appropriate counterparts
deg[[ct]] <- FindMarkers(integrated_object, ident.1 = "transduced", group.by = "LV_exZNF441",
subset.ident = ct, logfc.threshold = 0)
}Showing the differentially expressed genes by cell type
res_filt <- lapply(deg, function(x) x[x$p_val_adj < 0.1, ])
res_filt <- lapply(res_filt, function(x) {
x$geneName <- row.names(x)
return(x)
})
res_filt <- do.call(rbind, res_filt)
res_filt$CellType <- gsub("\\..*", "", row.names(res_filt))
row.names(res_filt) <- NULLhtmltools::tagList(datatable(res_filt, extensions = "Buttons", options = list(dom = "Bfrtip",
buttons = c("csv"))))
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-conda_cos6-linux-gnu (64-bit)
## Running under: Red Hat Enterprise Linux
##
## Matrix products: default
## BLAS/LAPACK: /gpfs/fs2/scratch/sgoldman_lab/.conda/envs/scRNA_v3.2/lib/R/lib/libRblas.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=en_US.UTF-8
## [9] LC_ADDRESS=en_US.UTF-8 LC_TELEPHONE=en_US.UTF-8
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] dplyr_0.8.3 knitr_1.26 DT_0.10 ggplot2_3.2.1
## [5] sctransform_0.2.0 Seurat_3.1.1.9002
##
## loaded via a namespace (and not attached):
## [1] tsne_0.1-3 nlme_3.1-142 bitops_1.0-6
## [4] RcppAnnoy_0.0.14 RColorBrewer_1.1-2 httr_1.4.1
## [7] tools_3.5.1 backports_1.1.5 R6_2.4.1
## [10] irlba_2.3.3 KernSmooth_2.23-16 uwot_0.1.5
## [13] lazyeval_0.2.2 colorspace_1.4-1 withr_2.1.2
## [16] npsurv_0.4-0 gridExtra_2.3 tidyselect_0.2.5
## [19] compiler_3.5.1 formatR_1.7 plotly_4.9.1
## [22] labeling_0.3 caTools_1.17.1.3 scales_1.1.0
## [25] lmtest_0.9-37 ggridges_0.5.1 pbapply_1.4-2
## [28] stringr_1.4.0 digest_0.6.23 rmarkdown_1.18
## [31] R.utils_2.9.1 pkgconfig_2.0.3 htmltools_0.4.0
## [34] bibtex_0.4.2 fastmap_1.0.1 htmlwidgets_1.5.1
## [37] rlang_0.4.2 shiny_1.4.0 farver_2.0.1
## [40] zoo_1.8-6 jsonlite_1.6 crosstalk_1.0.0
## [43] ica_1.0-2 gtools_3.8.1 R.oo_1.23.0
## [46] magrittr_1.5 Matrix_1.2-18 Rcpp_1.0.3
## [49] munsell_0.5.0 ape_5.3 reticulate_1.13
## [52] lifecycle_0.1.0 R.methodsS3_1.7.1 stringi_1.4.3
## [55] yaml_2.2.0 gbRd_0.4-11 MASS_7.3-51.4
## [58] gplots_3.0.1.1 Rtsne_0.15 plyr_1.8.4
## [61] grid_3.5.1 promises_1.1.0 parallel_3.5.1
## [64] gdata_2.18.0 listenv_0.8.0 ggrepel_0.8.1
## [67] crayon_1.3.4 lattice_0.20-38 cowplot_1.0.0
## [70] splines_3.5.1 SDMTools_1.1-221.2 zeallot_0.1.0
## [73] pillar_1.4.2 igraph_1.2.4.2 reshape2_1.4.3
## [76] future.apply_1.3.0 codetools_0.2-16 leiden_0.3.1
## [79] glue_1.3.1 evaluate_0.14 lsei_1.2-0
## [82] metap_1.1 RcppParallel_4.4.4 data.table_1.12.6
## [85] httpuv_1.5.2 vctrs_0.2.0 png_0.1-7
## [88] Rdpack_0.11-0 gtable_0.3.0 RANN_2.6.1
## [91] purrr_0.3.3 tidyr_1.0.0 future_1.15.1
## [94] assertthat_0.2.1 xfun_0.11 mime_0.7
## [97] rsvd_1.0.2 xtable_1.8-4 later_1.0.0
## [100] survival_3.1-8 viridisLite_0.3.0 tibble_2.1.3
## [103] cluster_2.1.0 globals_0.12.4 fitdistrplus_1.0-14
## [106] ROCR_1.0-7